```python
# lst = [results1, results2, results3, results4, results5, results6, results7, results8]
# pd.concat(lst)
```
Summary
| Attempt | throw (fraud_rate) | split (test_fraud_rate) | theta (s) | gamma | Notes |
|---|---|---|---|---|---|
| 1 | 0.3 | 0.05 | 8.028000e+04 | 0.3 | |
| 2 | 0.3 | 0.05 | 8.028000e+04 | 0.2 | |
| 3 | 0.3 | 0.05 | 9.028000e+04 | 0.2 | |
| 4 | 0.3 | 0.05 | 7.028000e+04 | 0.2 | |
| 5 | 0.3 | 0.005 | 8.028000e+04 | 0.3 | |
| 6 | 0.3 | 0.005 | 8.028000e+04 | 0.2 | |
| 7 | 0.3 | 0.005 | 9.028000e+04 | 0.2 | |
| 8 | 0.3 | 0.005 | 7.028000e+04 | 0.2 | |
| Attempt | accuracy_score | precision_score | recall_score | f1_score | roc_auc_score |
|---|---|---|---|---|---|
| 1 | 0.970862 | 0.639821 | 0.953333 | 0.765730 | 0.962559 |
| 2 | 0.968365 | 0.619565 | 0.950000 | 0.750000 | 0.959665 |
| 3 | 0.970030 | 0.635135 | 0.940000 | 0.758065 | 0.955804 |
| 4 | 0.967865 | 0.616558 | 0.943333 | 0.745718 | 0.956244 |
| 5 | 0.969530 | 0.627451 | 0.960000 | 0.758893 | 0.965016 |
| 6 | 0.970196 | 0.635955 | 0.943333 | 0.759732 | 0.957471 |
| 7 | 0.970529 | 0.633987 | 0.970000 | 0.766798 | 0.970279 |
| 8 | 0.968531 | 0.618337 | 0.966667 | 0.754226 | 0.967648 |
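The table above is just the eight one-row result frames stacked; a minimal sketch of the assembly, assuming `results1` through `results8` from the attempts below are in scope (the `ignore_index` call and the 1-based relabelling are my additions):

```python
# Stack the eight one-row result frames and label rows 1..8 to match the attempts.
lst = [results1, results2, results3, results4,
       results5, results6, results7, results8]
summary = pd.concat(lst, ignore_index=True)
summary.index = range(1, 9)
summary
```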
imports

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection  # train/test split utilities
from sklearn import ensemble         # RF, GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
```
```python
def throw(df, fraud_rate):  # downsample normal transactions to hit the target fraud rate
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    df0_downsample = (len(df1) * (1 - fraud_rate)) / (len(df0) * fraud_rate)
    df0_down = df0.sample(frac=df0_downsample, random_state=42)
    df_p = pd.concat([df1, df0_down])
    return df_p
```
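As a quick sanity check of the downsampling arithmetic: keeping every fraud row and a fraction `len(df1)*(1-r) / (len(df0)*r)` of the normal rows leaves a frame whose fraud rate is approximately `r`. A sketch on a hypothetical toy frame:

```python
# 10 fraud rows and 990 normal rows (1% fraud), downsampled to ~30% fraud:
toy = pd.DataFrame({'is_fraud': [1] * 10 + [0] * 990})
toy_down = throw(toy, 0.3)
# keeps all 10 fraud rows and ~23 normal rows, so the mean is ~0.3
print(len(toy_down), toy_down.is_fraud.mean())
```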
```python
def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3):
    n = len(data_frame)

    # Separate fraud and normal transactions
    fraud_data = data_frame[data_frame['is_fraud'] == 1]
    normal_data = data_frame[data_frame['is_fraud'] == 0]

    # Compute the test set sizes
    test_samples = int(test_fraud_rate * (n * test_rate))
    remaining_test_samples = int(n * test_rate) - test_samples

    # Randomly sample test rows from the fraud and normal transactions
    test_fraud_data = fraud_data.sample(n=test_samples, replace=False)
    test_normal_data = normal_data.sample(n=remaining_test_samples, replace=False)

    # Combine the test data
    test_data = pd.concat([test_normal_data, test_fraud_data])

    # Build the training data from everything not in the test set
    train_data = data_frame[~data_frame.index.isin(test_data.index)]

    return train_data, test_data
```
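The test set is `test_rate` of all rows, of which a fraction `test_fraud_rate` is fraud; a quick check on a hypothetical toy frame:

```python
# 1,000 rows at 30% fraud; the test split should be 300 rows at 5% fraud.
toy = pd.DataFrame({'is_fraud': [1] * 300 + [0] * 700})
toy_tr, toy_tst = split_dataframe(toy, test_fraud_rate=0.05)
print(len(toy_tst), toy_tst.is_fraud.mean())  # 300 0.05
```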
```python
def concat(df_tr, df_tst):
    df = pd.concat([df_tr, df_tst])
    # Boolean masks marking train vs. test rows, so the split is preserved
    # after the concatenated frame's index is reset.
    train_mask = np.concatenate((np.full(len(df_tr), True), np.full(len(df_tst), False)))
    test_mask = np.concatenate((np.full(len(df_tr), False), np.full(len(df_tst), True)))
    mask = (train_mask, test_mask)
    return df, mask
```
```python
def evaluation(y, yhat):
    metrics = [sklearn.metrics.accuracy_score,
               sklearn.metrics.precision_score,
               sklearn.metrics.recall_score,
               sklearn.metrics.f1_score,
               sklearn.metrics.roc_auc_score]
    return pd.DataFrame({m.__name__: [m(y, yhat).round(6)] for m in metrics})
```
```python
def compute_time_difference(group):
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs((group.iloc[i].trans_date_trans_time
                                   - group.iloc[j].trans_date_trans_time).total_seconds())
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
```
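Each group yields every ordered pair `(i, j)` together with `|t_i - t_j|` in seconds, so the cost is quadratic in the group size; a hypothetical two-row group for illustration:

```python
# Two transactions 60 seconds apart produce the four ordered pairs below.
g = pd.DataFrame({'trans_date_trans_time':
                  pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:01:00'])})
print(compute_time_difference(g))
# [[0, 0, 0.0], [0, 1, 60.0], [1, 0, 60.0], [1, 1, 0.0]]
```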
```python
def edge_index(df, unique_col, theta, gamma):
    groups = df.groupby(unique_col)
    edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups)
                           for item in sublist])
    edge_index = edge_index.astype(np.float64)
    # filename = f"edge_index{str(unique_col).replace(' ', '').replace('_', '')}.npy"  # save
    # np.save(filename, edge_index)
    # Edge weight exp(-dt/theta); the `!= 1` factor zeroes pairs with dt == 0 (self-pairs).
    edge_index[:, 2] = (np.exp(-edge_index[:, 2] / theta) != 1) * (np.exp(-edge_index[:, 2] / theta)).tolist()
    edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma],
                              dtype=torch.long).t()
    return edge_index
```
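An edge `(i, j)` survives when `exp(-dt/theta) > gamma`, i.e. when the time gap satisfies `dt < -theta * ln(gamma)`. A quick look at the cut-offs implied by the (theta, gamma) pairs used below:

```python
# Time-gap cut-off below which a pair of same-card transactions gets an edge.
for theta, gamma in [(8.028e4, 0.3), (8.028e4, 0.2), (9.028e4, 0.2), (7.028e4, 0.2)]:
    cutoff = -theta * np.log(gamma)  # seconds
    print(f"theta={theta:.3e}, gamma={gamma}: dt < {cutoff / 3600:.1f} hours")
```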
```python
def gcn_data(df, edge_index, mask):
    # edge_index and mask are passed in explicitly rather than read from globals.
    x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1, 1)  # node feature: amount
    y = torch.tensor(df['is_fraud'].values, dtype=torch.int64)            # node label
    data = torch_geometric.data.Data(x=x, edge_index=edge_index, y=y,
                                     train_mask=torch.tensor(mask[0]),
                                     test_mask=torch.tensor(mask[1]))
    return data
```
```python
class GCN1(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
```
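A minimal shape check on a hypothetical three-node graph, confirming that the model maps one feature per node to two log-probabilities per node:

```python
# Three nodes, one feature each; edges 0-1 and 1-2 in both directions.
toy_graph = torch_geometric.data.Data(
    x=torch.tensor([[1.0], [2.0], [3.0]]),
    edge_index=torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]], dtype=torch.long),
)
out = GCN1()(toy_graph)
print(out.shape)  # torch.Size([3, 2])
```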
```python
def train_and_evaluate_model(data, model, optimizer, num_epochs=400):
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
    model.eval()
    pred = model(data).argmax(dim=1)
    yyhat = pred[data.test_mask]
    return yyhat
```
```python
# # Create the model and optimizer
# model = GCN1()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# # Call the function
# yyhat = train_and_evaluate_model(data, model, optimizer)
```
```python
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
```
(throw 0.3 / split 0.05)
```python
df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.05)
df2, mask = concat(df_tr, df_tst)
df2['index'] = df2.index
df3 = df2.reset_index()
```

```python
df3.is_fraud.mean()
```

0.3

```python
df_tst.is_fraud.mean()
```

0.04995004995004995
Attempt 1

```python
eidx = edge_index(df3, 'cc_num', 8.028000e+04, 0.3)  # assigned to eidx so the edge_index function is not shadowed
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results1 = evaluation(yy, yyhat)
```
Attempt 2

```python
eidx = edge_index(df3, 'cc_num', 8.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results2 = evaluation(yy, yyhat)
```
Attempt 3

```python
eidx = edge_index(df3, 'cc_num', 9.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results3 = evaluation(yy, yyhat)
```
Attempt 4

```python
eidx = edge_index(df3, 'cc_num', 7.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results4 = evaluation(yy, yyhat)
```
(throw 0.3 / split 0.005)
```python
df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.005)
df2, mask = concat(df_tr, df_tst)
df2['index'] = df2.index
df3 = df2.reset_index()
```

```python
df3.is_fraud.mean()
```

0.3

```python
df_tst.is_fraud.mean()
```

0.004995004995004995
Attempt 5

```python
eidx = edge_index(df3, 'cc_num', 8.028000e+04, 0.3)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results5 = evaluation(yy, yyhat)
```
Attempt 6

```python
eidx = edge_index(df3, 'cc_num', 8.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results6 = evaluation(yy, yyhat)
```
Attempt 7

```python
eidx = edge_index(df3, 'cc_num', 9.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results7 = evaluation(yy, yyhat)
```
Attempt 8

```python
eidx = edge_index(df3, 'cc_num', 7.028000e+04, 0.2)
data = gcn_data(df3, eidx, mask)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat = train_and_evaluate_model(data, model, optimizer)
results8 = evaluation(yy, yyhat)
```